#### ---- SCRIPT 05: The purpose of this script is to summarise ideal point data availability for guests. ---- ####
#### ---- The summary is split by guest category type, nationality and show. ---- ####

# Heavily penalise scientific notation so numbers are printed in fixed notation
options(scipen = 999)

#### ---- LIBRARIES ---- ####

library(dplyr) # data wrangling
library(ggplot2) # data visualisation
library(data.table) # big data wrangling
library(treemapify) # generate treemap figures
library(ggridges) # generate ridge plots

#### ---- LOAD DATA ---- ####

# Twitter user data
user_data <- fread("user_ideal_point_data_updated.csv")

# Guest master list
guest_data <- fread("username_master_list_with_ideal_points.csv")

# TV guest lists: read every file in the guest list directory into its own table
guest_lists <- lapply(list.files("guest_lists/", full.names = TRUE), fread)

#### ---- MERGE THE GUEST LISTS TOGETHER AND MATCH TO THEIR IDEAL POINTS ---- ####

# Concatenate the individual guest lists together into one dataset
guest_lists_merged <- rbindlist(guest_lists)

# Match each guest appearance in the dataset to its master-list record: the guest's own ideal point and
# the ideal point of their organisation and organisation affiliates. The organisation values will only be
# used where an individual does not have an ideal point of their own.
guest_lists_merged <- left_join(guest_lists_merged, guest_data, by = c("Name" = "name"))

# Classify every guest appearance by the data available for it. This column is later summarised by
# T.V show, category type, and nationality.
#
# Appearances with no match in the master list come out of the left join with an NA username. Comparing
# NA to "" yields NA, which would propagate through ifelse() as an NA class, so missing usernames are
# explicitly treated as "No Twitter Account".
has_ideal_point <- !is.na(guest_lists_merged$user_ideal_point)
has_twitter <- !is.na(guest_lists_merged$username) & guest_lists_merged$username != ""

guest_lists_merged$data_available <- ifelse(
  has_ideal_point,
  "Has Ideal Point",
  ifelse(has_twitter, "Has Twitter, No Ideal Point", "No Twitter Account")
)

#### ---- SUMMARISE MISSING DATA BY SHOW AND GUEST CATEGORY ---- ####

# Overall data availability: number and share of guest appearances in each availability class.
# The factor levels fix the order of the classes along the axis, separating guests without a
# Twitter account from those with an account but no ideal point and those with an ideal point.
missing_data <- guest_lists_merged %>%
  group_by(data_available) %>%
  summarise(count = n()) %>%
  mutate(
    percentage = count / sum(count) * 100,
    data_available = factor(
      data_available,
      levels = c("No Twitter Account", "Has Twitter, No Ideal Point", "Has Ideal Point")
    )
  )

# Horizontal bar chart of the counts, each bar labelled with its rounded percentage
guest_missing_data_all <- ggplot(missing_data, aes(x = data_available, y = count)) +
  geom_col(fill = "skyblue", width = 0.7, alpha = 0.8) +
  geom_text(aes(label = paste0(round(percentage), "%")), hjust = -0.1, size = 3) +
  labs(title = "", x = "", y = "Count") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 12)) +
  coord_flip()

ggsave(
  filename = "guest_missing_data_all.png",
  plot = guest_missing_data_all,
  width = 7,
  height = 4,
  units = "in",
  dpi = 300,
  bg = "white"
)

# Missing data by guest category and nationality

# Merge the nationality column into the category type column by duplicating the guest list data and
# overwriting category_type with nationality_type in the copy.
# This allows missing data by guest category and nationality to be compared in the same plot.
guest_list_nationality <- guest_lists_merged
guest_list_nationality$category_type <- guest_list_nationality$nationality_type

guest_lists_missing_data <- rbind(guest_lists_merged, guest_list_nationality)

# Count appearances per category and availability class, then express each count as a share of its
# category. The result is ungrouped so later column assignments act on a plain table.
guest_lists_missing_data_summary <- guest_lists_missing_data %>%
  group_by(category_type, data_available) %>%
  summarise(count = n()) %>%
  arrange(category_type) %>%
  group_by(category_type) %>%
  mutate(percentage = count / sum(count) * 100,
         cum_perc = cumsum(percentage)) %>%
  ungroup()

# Order plot by ideal point availability descending.
# Every category is kept in the ordering: a category with no "Has Ideal Point" row counts as 0% rather
# than being left out of the factor levels, which would turn it into NA in the plot.
guest_lists_missing_data_ordered <- guest_lists_missing_data_summary %>%
  group_by(category_type) %>%
  summarise(ideal_point_percentage = sum(percentage[data_available %in% "Has Ideal Point"])) %>%
  arrange(desc(ideal_point_percentage)) %>%
  pull(category_type)

# Levels are reversed because coord_flip() draws the first level at the bottom of the chart
guest_lists_missing_data_summary$category_type <- factor(guest_lists_missing_data_summary$category_type,
                                                         levels = rev(guest_lists_missing_data_ordered))

guest_lists_missing_data_summary$data_available <- factor(guest_lists_missing_data_summary$data_available,
                                                          levels = c("No Twitter Account",
                                                                     "Has Twitter, No Ideal Point",
                                                                     "Has Ideal Point"))

# Plot as a stacked bar chart, labelling each segment with its percentage
guests_missing_data_plot <- ggplot(guest_lists_missing_data_summary,
                                   aes(x = category_type, y = percentage, fill = data_available)) +
  geom_col(position = "stack", width = 0.5, alpha = 0.8) +
  geom_text(aes(label = sprintf("%.1f%%", percentage)),
            position = position_stack(vjust = 0.5),
            size = 2,
            color = "black") +
  labs(title = "",
       x = "",
       y = "Percentage",
       fill = "") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 12),
        legend.position = "top") +
  coord_flip()

ggsave("guest_ideal_point_availability_by_category.png",
       guests_missing_data_plot,
       units = "in", width = 7, height = 4, dpi = 300,
       bg = "white")

# - Summarise missing data by show - #

# Keep only the guest appearances that do not have an ideal point
tv_show_missing_data <- guest_lists_merged %>%
  filter(data_available != "Has Ideal Point")

# Recode organisation leaning category (manually coded) and summarise missing data by organisation and org leaning
tv_show_missing_data$organisation_leaning[tv_show_missing_data$organisation_leaning == "RIght"] <- "Right"
tv_show_missing_data$organisation_leaning[tv_show_missing_data$organisation_leaning == "Centre/Neutral/Independent"] <- "Centre/Neutral"
tv_show_missing_data$organisation_leaning[tv_show_missing_data$organisation_leaning == "N/A"] <- "Unk."

# Count appearances per show, guest category and organisation leaning, then express each count as a
# share of all missing-data appearances on that show. The result is ungrouped once the shares are computed.
tv_show_missing_data_leaning <- tv_show_missing_data %>%
  group_by(Show, category_type, organisation_leaning) %>%
  summarise(count = n()) %>%
  arrange(Show) %>%
  group_by(Show) %>%
  mutate(percentage = count / sum(count) * 100,
         cum_perc = cumsum(percentage)) %>%
  ungroup()

# Single label combining guest category and organisation leaning, used as the heatmap's y-axis
tv_show_missing_data_leaning$org_lean_merged <- paste(tv_show_missing_data_leaning$category_type,
                                                      tv_show_missing_data_leaning$organisation_leaning)

tv_show_missing_data_leaning_coords <- tv_show_missing_data_leaning %>%
  select(Show, org_lean_merged, percentage)

# Order the categories by most occurring for plotting
guest_categories_ordered <- tv_show_missing_data_leaning %>%
  group_by(org_lean_merged) %>%
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  pull(org_lean_merged)

# Build the factor from the plotting table's own column so the result does not depend on the two
# tables staying row-aligned. Levels are reversed so the most frequent label is drawn at the top.
tv_show_missing_data_leaning_coords$org_lean_merged <- factor(tv_show_missing_data_leaning_coords$org_lean_merged,
                                                              levels = rev(guest_categories_ordered))

# Reorder shows along the plot x-axis.
# NOTE(review): a show name that is not in this list becomes NA here - confirm the list covers every guest list.
tv_show_missing_data_leaning_coords$Show <- factor(tv_show_missing_data_leaning_coords$Show,
                                                   levels = c("BBC One: QT", "BBC One: SwLK", "BBC Two: PL",
                                                              "ITV: Peston", "Channel 4: ANS", "Sky: SRoS",
                                                              "GB News: CTS"))

# Plot as a gradient heatmap (white -> grey -> black); the grey stop sits at 10% of the fill range
missing_data_leaning_heatmap <- ggplot(tv_show_missing_data_leaning_coords,
                                       aes(Show, org_lean_merged, fill = percentage)) +
  geom_tile() +
  labs(x = "",
       y = "",
       fill = "Percentage",
       title = "Guest Category/Org Political Leaning (Missing Data)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1),
        axis.title = element_blank(),
        panel.grid = element_blank()) +
  scale_fill_gradientn(colours = c("white", "grey", "black"), values = c(0, 0.1, 1))

ggsave("guests_leaning_by_show_missing_data_heatmap.png",
       missing_data_leaning_heatmap,
       units = "in", width = 7, height = 7, dpi = 300,
       bg = "white")


# Data availability by show, plotted as a stacked bar chart.
# Note: this reuses the name tv_show_missing_data, replacing the filtered table built for the heatmap.

# Count appearances per show and availability class, then express each count as a share of its show.
# The per-show grouping is stated explicitly rather than relying on the grouping summarise() leaves behind.
tv_show_missing_data <- guest_lists_merged %>%
  group_by(Show, data_available) %>%
  summarise(count = n()) %>%
  group_by(Show) %>%
  mutate(percentage = count / sum(count) * 100) %>%
  ungroup()

tv_show_missing_data$data_available <- factor(tv_show_missing_data$data_available,
                                              levels = c("No Twitter Account",
                                                         "Has Twitter, No Ideal Point",
                                                         "Has Ideal Point"))

# Order shows by data availability for benefit of plot interpretability.
# Every show is kept in the ordering: a show with no "Has Ideal Point" row counts as 0% rather than
# being left out of the factor levels, which would turn it into NA in the plot.
shows_plot_ordering <- tv_show_missing_data %>%
  group_by(Show) %>%
  summarise(ideal_point_percentage = sum(percentage[data_available %in% "Has Ideal Point"])) %>%
  arrange(ideal_point_percentage) %>%
  pull(Show)

tv_show_missing_data$Show <- factor(tv_show_missing_data$Show, levels = shows_plot_ordering)

# Plot as a stacked bar chart, labelling each segment with its percentage
tv_missing_data_plot <- ggplot(tv_show_missing_data, aes(x = Show, y = percentage, fill = data_available)) +
  geom_col(position = "stack", width = 0.5, alpha = 0.8) +
  geom_text(aes(label = sprintf("%.1f%%", percentage)),
            position = position_stack(vjust = 0.5),
            size = 2,
            color = "black") +
  labs(title = "",
       x = "",
       y = "Percentage",
       fill = "") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 12),
        legend.position = "top") +
  coord_flip()

ggsave("guest_ideal_point_availability_by_show.png",
       tv_missing_data_plot,
       units = "in", width = 7, height = 4, dpi = 300,
       bg = "white")

#### ---- END ---- ####
